#!/usr/bin/perl -w
use strict;
use warnings;
use DBI;
use LWP::Protocol::https;
use LWP::UserAgent;
use Digest::MD5 qw(md5 md5_hex md5_base64);
use Encode;
use utf8;
# Put a utf8 encoding layer on the three standard streams.
binmode(STDIN,':encoding(utf8)');
binmode(STDOUT,':encoding(utf8)');
binmode(STDERR,':encoding(utf8)'); 
use Time::Local;
use Date::Parse;

# Entry point: start crawling the board at page 1.
insert_news(1);

# Crawl the vulbox "internet" board, starting at page $_[0], and store every
# listed entry published on or after the 7-day cutoff in the `news` table.
#
# Arguments:
#   $_[0] - number of the first board page to fetch; following pages are
#           fetched in increasing order, one second apart.
#
# Returns nothing.  Crawling stops at the first response that is not
# successful, the first page that contains no entries, or the first entry whose
# publish time is older than the cutoff or cannot be parsed.
# NOTE(review): stopping at the first old entry presumably relies on the board
# listing entries newest first -- confirm against the site.
#
# Dies when the database connection, a prepare or an execute fails, or when
# the cutoff timestamp cannot be obtained.
sub insert_news {
    my $page = $_[0];

    # NOTE(review): credentials are hard-coded in the source; consider loading
    # them from configuration or the environment instead.
    my $driver   = "DBI:mysql";
    my $database = "news";
    my $host     = "localhost";
    my $user     = "root";
    my $password = "192510";

    # One connection serves the whole crawl.
    my $db = DBI->connect("$driver:database=$database;host=$host", $user, $password)
        or die "cannot connect to database '$database' on '$host': $DBI::errstr";

    # Cutoff: Unix timestamp of midnight seven days ago, as computed by MySQL.
    my $cutoff_sth = $db->prepare("select UNIX_TIMESTAMP(date_sub(curdate(),interval 7 day))")
        or die $db->errstr;
    $cutoff_sth->execute() or die $cutoff_sth->errstr;
    my ($cutoff) = $cutoff_sth->fetchrow_array;
    $cutoff_sth->finish;
    die "could not determine the cutoff timestamp" unless defined $cutoff;

    # Prepared once and reused for every row.
    my $insert_sth = $db->prepare(q{insert ignore into news(title,web,author,publishtime,content,link,nmd5)VALUES(?,?,?,?,?,?,?)})
        or die $db->errstr;

    my $web     = 1;
    my $content = "无";

    my $ua = LWP::UserAgent->new;

    PAGE: while (1) {
        # agent() takes the User-Agent value itself; a fresh random one is
        # used for every page.
        $ua->agent(randomagent());

        my $response = $ua->get("https://www.vulbox.com/board/internet/page/$page");
        last PAGE unless $response->is_success;

        my $html = decode("utf-8", $response->content);

        # Everything before the first marker is page chrome, not an entry.
        my @entries = split(/<div class="bugs-info">/, $html);
        shift @entries;

        # A page without entries means there is nothing further to crawl.
        last PAGE unless @entries;

        for my $entry_html (@entries) {
            my $entry = _parse_vulbox_entry($entry_html);

            # NOTE(review): the site's time is interpreted as GMT here while
            # the cutoff comes from the MySQL session -- confirm both agree.
            my $published = $entry->{publishtime} =~ /\S/
                ? str2time("$entry->{publishtime} GMT")
                : undef;
            if (!defined $published) {
                warn "insert_news: unparsable publish time on page $page, stopping\n";
                last PAGE;
            }

            # First entry older than the cutoff ends the crawl.
            last PAGE if $published < $cutoff;

            # Store only entries whose title, link and author are not blank.
            next unless $entry->{title}  =~ /\S/
                    and $entry->{href}   =~ /\S/
                    and $entry->{author} =~ /\S/;

            my $link = "https://www.vulbox.com$entry->{href}";

            # Digest of the UTF-8 bytes of title+author, stored in nmd5.
            my $md5 = md5(encode_utf8($entry->{title} . $entry->{author}));

            $insert_sth->execute($entry->{title}, $web, $entry->{author},
                                 $entry->{publishtime}, $content, $link, $md5)
                or die $insert_sth->errstr;
        }

        sleep(1);
        $page++;
    }

    $db->disconnect;
    return;
}

# Extract the fields of one board entry from the HTML that follows a
# `<div class="bugs-info">` marker.
#
# Returns a hash reference with the keys title, href (link as written in the
# page, double quotes removed), author and publishtime.  A field that cannot
# be located is returned as the empty string.
sub _parse_vulbox_entry {
    my ($entry_html) = @_;

    my %entry = (title => '', href => '', author => '', publishtime => '');
    return \%entry unless defined $entry_html;

    my @sections = split(/<a target="_blank" href=/, $entry_html);

    # Section 1 starts with the quoted link, then `>`, the title and `</a>`.
    if (defined $sections[1]) {
        my @before_close   = split(/<\/a>/, $sections[1]);
        my @link_and_title = defined $before_close[0]
            ? split(/>/, $before_close[0])
            : ();
        if (defined $link_and_title[0]) {
            ($entry{href} = $link_and_title[0]) =~ s/"//g;
        }
        $entry{title} = $link_and_title[1] if defined $link_and_title[1];
    }

    # Section 3 holds the author in <b>...</b> and the publish time in the
    # bugs-time span.
    if (defined $sections[3]) {
        my @author_and_time = split(/<span class="bugs-time">提交时间：/, $sections[3]);

        if (defined $author_and_time[0]) {
            my @after_bold = split(/<b>/, $author_and_time[0]);
            if (defined $after_bold[1]) {
                my @author = split(/<\/b>/, $after_bold[1]);
                $entry{author} = $author[0] if defined $author[0];
            }
        }

        if (defined $author_and_time[1]) {
            my @time = split(/<\/span>/, $author_and_time[1]);
            $entry{publishtime} = $time[0] if defined $time[0];
        }
    }

    return \%entry;
}

# Pretend to be a random browser: return one User-Agent string picked at
# random from a fixed pool.
#
# Takes no arguments.  Returns one element of @agent_conf.
sub randomagent
{
    my @agent_conf = (
        # Firefox
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:13.0) Gecko/20100101 Firefox/13.0.1',
        # Chrome
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19 QIHU 360EE',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
        # Safari
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
        # Sogou
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        # Baidu
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; zh_CN) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/18.0 BIDUBrowser/2.6 Safari/534.7',
        # Maxthon
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.12 (KHTML, like Gecko) Maxthon/3.4.2.3000 Chrome/18.0.966.0 Safari/535.12',
        # LBBROWSER
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER',
        # TheWorld
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.79 Safari/535.11 QIHU THEWORLD',
    );

    # Known strings that are deliberately left out of the pool above:
    #   Firefox: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1
    #   QQ:      Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.7 (KHTML, like Gecko) Chrome/20.0.1099.0 Safari/536.7 QQBrowser/6.14.15138.201

    # Use the pool's own size so that adding or removing an entry cannot leave
    # the index range out of step with the list.
    my $index = int(rand(scalar @agent_conf));
    return $agent_conf[$index];
}

